LAGOS Analysis

Loading in data

First download the LAGOS data, then grab the locus table, which holds the latitude and longitude of each lake site.

# LAGOS download script (only needs to be run once to fetch the data locally)
# lagosne_get(dest_folder = LAGOSNE:::lagos_path(), overwrite = TRUE)

# Load in LAGOS. The version is pinned to the one that was previously picked
# up implicitly ("LAGOSNE version unspecified, loading version: 1.087.3") so
# the analysis is reproducible and the warning is no longer raised.
lagos <- lagosne_load(version = "1.087.3")

# Grab the lake centroid info
lake_centers <- lagos$locus

# Make an sf point object from the centroid longitude/latitude (crs = 4326)
spatial_lakes <- st_as_sf(
  lake_centers,
  coords = c("nhd_long", "nhd_lat"),
  crs = 4326
)

# The water quality observations live in the epi_nutr table
nutr <- lagos[["epi_nutr"]]

# Uncomment to inspect the available columns
# names(nutr)

Subset the columns of `nutr` to keep only the key variables that we want

# Keep the lake id, sample date and the three clarity-related variables;
# sampledate is converted to character and then parsed with ymd().
clarity_only <- nutr %>%
  select(lagoslakeid, sampledate, chla, doc, secchi) %>%
  mutate(sampledate = ymd(as.character(sampledate)))

Keep sites with at least 200 observations

# Look at the number of rows of dataset
# nrow(clarity_only)

# Keep only the rows where both chla and secchi were recorded
chla_secchi <- clarity_only %>%
  filter(
    !is.na(chla),
    !is.na(secchi)
  )

# How many observations did we lose?
filteredObservations <- nrow(clarity_only) - nrow(chla_secchi)


# Keep only the lakes with at least 200 observations of secchi and chla.
# `>=` (not `>`) so that a lake with exactly 200 observations is retained,
# matching "at least 200". The per-lake row count is kept in `count`, and the
# result is ungrouped so later steps do not silently operate per lake.
chla_secchi_200 <- chla_secchi %>%
  group_by(lagoslakeid) %>%
  mutate(count = n()) %>%
  filter(count >= 200) %>%
  ungroup()

We lost 651095 observations because they were missing Secchi or Chlorophyll data.

Join water quality data to spatial data

# Reduce the filtered observations to one row per lake, then attach those rows
# to the lake points; only lakes present in both tables are kept.
spatial_200 <- chla_secchi_200 %>%
  distinct(lagoslakeid, .keep_all = TRUE) %>%
  inner_join(spatial_lakes, ., by = "lagoslakeid")

Mean Chl_a map

### Per-lake mean chl_a and secchi

mean_values_200 <- chla_secchi_200 %>%
  # One summary row per lake id
  group_by(lagoslakeid) %>%
  summarise(
    mean_chla = mean(chla, na.rm = TRUE),
    mean_secchi = mean(secchi, na.rm = TRUE)
  ) %>%
  # Drop lakes where either mean is missing
  filter(!(is.na(mean_chla) | is.na(mean_secchi))) %>%
  # Log base 10 of the mean chl_a, used to colour the map
  mutate(log10_mean_chla = log10(mean_chla))

# Attach the per-lake means to the lake points
mean_spatial <- inner_join(
  spatial_lakes,
  mean_values_200,
  by = "lagoslakeid"
)

# Map the lakes, coloured by log10 mean chl_a
mapview(mean_spatial, zcol = "log10_mean_chla")

Class work

1) What is the correlation between Secchi Disk Depth and Chlorophyll a for sites with at least 200 observations?

Here, I just want a plot of chla vs secchi for all sites

# Per-lake means: mean chla against mean secchi
ggplot(mean_values_200, aes(x = mean_secchi, y = mean_chla)) +
  geom_point()

# Every paired observation: chla against secchi
ggplot(chla_secchi, aes(x = secchi, y = chla)) +
  geom_point()

Why might this be the case?

Chlorophyll is used for photosynthesis, which requires light as an input. As depth below the surface increases, light decreases. Additionally, deeper lakes tend to produce less photosynthetic algae due to increased mechanical mixing of different water layers.

2 What states have the most data?

2a First you will need to make a lagos spatial dataset that has the total number of counts per site.

## Per-lake summary: mean chla, mean secchi and the number of observations
lago_summary <- chla_secchi %>%
  group_by(lagoslakeid) %>%
  summarise(
    mean_chla = mean(chla, na.rm = TRUE),
    mean_secchi = mean(secchi, na.rm = TRUE),
    count = n()
  )

## Attach the lake locations (keeping every summarised lake) and convert the
## result to sf points built from the centroid longitude/latitude
lago_location_summary <- lago_summary %>%
  merge(lake_centers, by = "lagoslakeid", all.x = TRUE) %>%
  st_as_sf(coords = c("nhd_long", "nhd_lat"), crs = 4326)

mapview(lago_location_summary)

2b Second, you will need to join this point dataset to the us_boundaries data.

# State polygons, keeping only the state name and abbreviation
state_bounds <- us_states() %>%
  select(state_name, state_abbr)

# Spatially join each lake point to the state polygons; left = TRUE keeps
# lakes that do not match any state
lago_location_summary_join_state <- st_join(
  x = lago_location_summary,
  y = state_bounds,
  left = TRUE
)

# Lakes with at least 200 observations. `>=` (not `>`) so that a lake with
# exactly 200 observations is included, matching "at least 200".
lago_location_summary_join_state_200 <- lago_location_summary_join_state %>%
  filter(count >= 200)

2c Then you will want to group by state and sum all the observations in that state and arrange that data from most to least total observations per state.

## Collapse per-lake summaries to one row per state: average the lake-level
## means, total the observation counts, and sort from most to least
## observations.
summarise_by_state <- function(lakes_with_state) {
  lakes_with_state %>%
    group_by(state_name) %>%
    summarise(
      mean_chla = mean(mean_chla, na.rm = TRUE),
      mean_secchi = mean(mean_secchi, na.rm = TRUE),
      count = sum(count)
    ) %>%
    arrange(desc(count))
}

## All lakes
state_data <- summarise_by_state(lago_location_summary_join_state)

## verify all observations totaled correctly **success**
sum(state_data$count)
## [1] 165000

## Only the lakes that passed the 200-observation filter
state_data_200 <- summarise_by_state(lago_location_summary_join_state_200)
### Map of which states have the most observations
# The per-state totals are attached to the state polygons with a keyed join on
# state_name rather than a spatial join: the totals are already labelled by
# state, so matching on the name is exact, avoids duplicated
# state_name.x / state_name.y columns, and cannot mis-assign a state whose
# lakes touch a neighbouring border. Geometry is dropped from the summary so
# the polygons from state_bounds are the only geometry in the result.
state_obs_count <- left_join(
  state_bounds,
  st_drop_geometry(state_data),
  by = "state_name"
)

mapview(state_obs_count, zcol = "count")

### Same map, restricted to the lakes that passed the 200-observation filter
state_obs_count_200 <- left_join(
  state_bounds,
  st_drop_geometry(state_data_200),
  by = "state_name"
)

mapview(state_obs_count_200, zcol = "count")

3 Is there a spatial pattern in Secchi disk depth for lakes with at least 200 observations?

## Map the lakes that passed the 200-observation filter, coloured by mean_secchi

mapview(lago_location_summary_join_state_200, zcol = 'mean_secchi')

The lakes with more than 200 observations are all centered on urban areas. This shows more of a bias towards accessibility than a spatial connection with Secchi disk depth.

Why 200 observations

# Distribution of the number of observations per lake, on a log10 x-axis.
# bins is set explicitly to 30 (the value stat_bin() was already defaulting to)
# so the plot is unchanged but the "Pick better value" message is not emitted.
ggplot(lago_location_summary_join_state, aes(x = count)) +
  geom_histogram(bins = 30) +
  scale_x_log10()

This histogram shows the distribution of observations per lake. The threshold of 200 observations per lake is somewhat arbitrary. Looking at 10 to 50 observations per lake would capture more than just the outlier lakes with an extreme amount of observation activity.

25 observations per lake

# Lakes with more than 25 observations, mapped and coloured by mean_secchi
lago_location_summary_join_state_25 <- filter(
  lago_location_summary_join_state,
  count > 25
)

mapview(lago_location_summary_join_state_25, zcol = "mean_secchi")

At 25 observations per lake this demonstrates 2 clear spatial relations for Secchi depth:

  1. Increasing latitude is positively correlated with Secchi depth. This could be due to decreased agricultural activity in the north.

  2. Increased distance from population centers is positively correlated with Secchi depth. This could be due to less human-made pollution adding nutrients to the water.